1 预处理

# 读取数据,去除 id 和 day 列
data <- read.csv("train.csv")
data_selected <- data %>% select(-id, -day)

# 分离 rainfall,其他变量做标准化处理
rainfall <- data_selected$rainfall
predictors <- data_selected %>% select(-rainfall)
predictors_scaled <- scale(predictors)

1.1 特征组合

var_names <- names(predictors)
results <- list()

# 枚举所有至少包含两个变量的组合
for (k in 2:length(var_names)) {
  var_combos <- combn(var_names, k, simplify = FALSE)
  for (combo in var_combos) {
    subset_data <- predictors[, combo]
    subset_scaled <- scale(subset_data)
    
    set.seed(123)
    km <- kmeans(subset_scaled, centers = 2)
    cluster <- as.factor(km$cluster)
    
    # 比较两种标签映射方式,取更高准确率
    acc1 <- mean(cluster == rainfall)
    acc2 <- mean(rev(levels(cluster))[cluster] == rainfall)
    accuracy <- max(acc1, acc2)
    
    results[[paste(combo, collapse = "+")]] <- list(
      variables = combo,
      accuracy = accuracy
    )
  }
}

# 整理并排序准确率结果
accuracy_df <- tibble(
  variables = names(results),
  accuracy = map_dbl(results, ~ .x$accuracy)
) %>%
  arrange(desc(accuracy))

# 输出前10准确率最高的变量组合
head(accuracy_df, 10)
## # A tibble: 10 × 2
##    variables                     accuracy
##    <chr>                            <dbl>
##  1 cloud+winddirection              0.698
##  2 dewpoint+cloud                   0.697
##  3 dewpoint+cloud+winddirection     0.697
##  4 pressure+cloud                   0.697
##  5 pressure+dewpoint+cloud          0.696
##  6 cloud+windspeed                  0.687
##  7 dewpoint+cloud+windspeed         0.684
##  8 pressure+cloud+windspeed         0.680
##  9 pressure+cloud+winddirection     0.679
## 10 cloud+winddirection+windspeed    0.676

2 k-means

2.1 Original

subset_vars <- predictors[, c("cloud", "winddirection")]
subset_scaled <- scale(subset_vars)

set.seed(123)
km <- kmeans(subset_scaled, centers = 2)
cluster <- as.factor(km$cluster)

cat("各簇大小:\n")
## 各簇大小:
print(table(cluster))
## cluster
##    1    2 
## 1714  476
cat("\n簇中心(标准化变量):\n")
## 
## 簇中心(标准化变量):
print(km$centers)
##        cloud winddirection
## 1  0.4669527   -0.02683802
## 2 -1.6814222    0.09663942
cat("\n聚类标签与真实rainfall混淆矩阵:\n")
## 
## 聚类标签与真实rainfall混淆矩阵:
print(table(Cluster = cluster, Rainfall = rainfall))
##        Rainfall
## Cluster    0    1
##       1  186 1528
##       2  354  122
acc1 <- mean(cluster == rainfall)
acc2 <- mean(rev(levels(cluster))[cluster] == rainfall)
cat("\n两种映射准确率:\n")
## 
## 两种映射准确率:
cat("直接匹配:", acc1, "\n")
## 直接匹配: 0.6977169
cat("标签反转:", acc2, "\n")
## 标签反转: 0.05570776
plot_df <- as.data.frame(subset_scaled) %>%
  mutate(Cluster = cluster, Rainfall = as.factor(rainfall))

ggplot(plot_df, aes(x = cloud, y = winddirection, color = Cluster, shape = Rainfall)) +
  geom_point(alpha = 0.7, size = 3) +
  labs(title = "Cloud和Winddirection的K-means聚类结果",
       x = "Cloud (标准化)", y = "Winddirection (标准化)") +
  theme_minimal()

2.2 PCA

set.seed(123)
kmeans_result <- kmeans(predictors_scaled, centers = 2)

# PCA降维到2维和3维
pca_sample <- prcomp(predictors_scaled, center = FALSE, scale. = FALSE)
pca_2d <- as.data.frame(pca_sample$x[, 1:2])
pca_3d <- as.data.frame(pca_sample$x[, 1:3])

names(pca_2d) <- c("PC1", "PC2")
pca_2d$cluster <- as.factor(kmeans_result$cluster)
pca_2d$rainfall <- as.factor(rainfall)

# 创建聚类与雨量标签组合变量
pca_2d <- pca_2d %>%
  mutate(combo = case_when(
    cluster == "1" & rainfall == "1" ~ "聚类1_下雨",
    cluster == "1" & rainfall == "0" ~ "聚类1_不下雨",
    cluster == "2" & rainfall == "1" ~ "聚类2_下雨",
    cluster == "2" & rainfall == "0" ~ "聚类2_不下雨"
  ))

color_map <- c(
  "聚类1_下雨" = "#1f77b4",
  "聚类1_不下雨" = "#d62728",
  "聚类2_下雨" = "#ff7f0e",
  "聚类2_不下雨" = "purple"
)

ggplot(pca_2d, aes(x = PC1, y = PC2, color = combo)) +
  geom_point(size = 2) +
  scale_color_manual(values = color_map) +
  theme_minimal() +
  labs(title = "K-means聚类与雨量组合的二维PCA散点图",
       color = "组合类别")

# 对标准化变量做PCA
pca_variable <- prcomp(predictors_scaled, center = FALSE, scale. = FALSE)

# 查看主成分方差解释情况
summary(pca_variable)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.3466 1.4770 0.92917 0.70666 0.64952 0.49454 0.42164
## Proportion of Variance 0.5507 0.2181 0.08633 0.04994 0.04219 0.02446 0.01778
## Cumulative Proportion  0.5507 0.7688 0.85514 0.90507 0.94726 0.97172 0.98950
##                           PC8     PC9    PC10
## Standard deviation     0.2627 0.16686 0.09041
## Proportion of Variance 0.0069 0.00278 0.00082
## Cumulative Proportion  0.9964 0.99918 1.00000
# 提取前三个主成分的变量载荷(贡献)
loadings <- as.data.frame(pca_variable$rotation[, 1:3])
loadings$var <- rownames(loadings)

# 构建3D箭头起止点数据,用于绘制变量投影线
arrow_lines <- data.frame(
  x = c(rbind(0, loadings$PC1)),
  y = c(rbind(0, loadings$PC2)),
  z = c(rbind(0, loadings$PC3)),
  group = rep(1:nrow(loadings), each = 2)
)

# 绘制变量在前三主成分空间的3D投影箭头图
plot_ly() %>%
  add_trace(
    data = arrow_lines,
    type = "scatter3d",
    mode = "lines",
    x = ~x, y = ~y, z = ~z,
    split = ~group,
    line = list(color = 'gray', width = 4),
    showlegend = FALSE
  ) %>%
  add_trace(
    data = loadings,
    type = "scatter3d",
    mode = "text",
    x = ~PC1, y = ~PC2, z = ~PC3,
    text = ~var,
    textposition = "top center",
    textfont = list(size = 12),
    showlegend = FALSE
  ) %>%
  layout(
    title = "变量主成分投影(3D箭头图)",
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    )
  )
# 添加降雨状态标签到3D PCA数据
rainfall_factor <- factor(rainfall, levels = c(0,1), labels = c("不下雨", "下雨"))
pca_3d$rainfall <- rainfall_factor

# 绘制样本在前三主成分空间中的3D散点图,颜色区分降雨状态
fig <- plot_ly(
  pca_3d,
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~rainfall,
  colors = c("#1f77b4", "#d62728"),
  type = 'scatter3d',
  mode = 'markers',
  marker = list(size = 3, opacity = 0.8)
) %>%
  layout(
    title = "样本主成分空间中的降雨状态",
    scene = list(
      xaxis = list(title = 'PC1'),
      yaxis = list(title = 'PC2'),
      zaxis = list(title = 'PC3')
    ),
    legend = list(title = list(text = "<b>降雨状态</b>"))
  )

fig
pca_top3 <- as.data.frame(pca_variable$x[, 1:3])

set.seed(123)
km_pca <- kmeans(pca_top3, centers = 2)
cluster <- as.factor(km_pca$cluster)

# 计算两种标签映射的准确率,取最大值
acc1 <- mean(cluster == rainfall)
acc2 <- mean(rev(levels(cluster))[cluster] == rainfall)
accuracy <- max(acc1, acc2)

cat("使用前三主成分进行K-means聚类的准确率:", round(accuracy, 4), "\n")
## 使用前三主成分进行K-means聚类的准确率: 0.4438

3 因子分析

# 使用平行分析辅助确定因子数量
fa.parallel(predictors_scaled, fa = "fa")

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA
# 进行因子分析,提取3个因子,旋转方式为varimax,估计方法为最大似然
fa_result <- fa(predictors_scaled, nfactors = 3, rotate = "varimax", fm = "ml")

# 查看因子载荷矩阵(只显示载荷大于0.3的项)
print(fa_result$loadings, cutoff = 0.3)
## 
## Loadings:
##               ML1    ML2    ML3   
## pressure      -0.834              
## maxtemp        0.962              
## temparature    0.985              
## mintemp        0.984              
## dewpoint       0.960              
## humidity              0.695       
## cloud                 0.906       
## sunshine             -0.841       
## winddirection  0.670              
## windspeed     -0.322              
## 
##                  ML1   ML2   ML3
## SS loadings    5.139 2.126 0.113
## Proportion Var 0.514 0.213 0.011
## Cumulative Var 0.514 0.726 0.738
# 查看部分因子得分
head(fa_result$scores)
##             ML1        ML2        ML3
## [1,] -0.4947246  0.8069461  0.8239284
## [2,] -1.2090124  1.0603347  1.1394742
## [3,] -1.7987775 -1.8475852 -0.1952701
## [4,] -0.9655903  1.2514258  1.1623847
## [5,] -1.5022529 -1.9457799 -1.9429474
## [6,] -1.1017496  0.1098068 -1.1701963
# 将因子得分转换为数据框,便于后续聚类或分析
factor_scores <- as.data.frame(fa_result$scores)
# 先做10因子因子分析,查看方差解释比例
fa_result <- fa(predictors_scaled, nfactors = 10, rotate = "varimax", fm = "ml")
fa_result$Vaccounted
##                             ML1       ML2        ML5         ML4         ML3
## SS loadings           5.0033893 2.0102884 0.23655106 0.079292109 0.064380994
## Proportion Var        0.5003389 0.2010288 0.02365511 0.007929211 0.006438099
## Cumulative Var        0.5003389 0.7013678 0.72502287 0.732952081 0.739390181
## Proportion Explained  0.6766913 0.2718846 0.03199272 0.010723987 0.008707310
## Cumulative Proportion 0.6766913 0.9485760 0.98056870 0.991292690 1.000000000
##                                ML6          ML8          ML7          ML9
## SS loadings           7.583910e-30 7.583910e-30 7.583910e-30 7.583909e-30
## Proportion Var        7.583910e-31 7.583910e-31 7.583910e-31 7.583909e-31
## Cumulative Var        7.393902e-01 7.393902e-01 7.393902e-01 7.393902e-01
## Proportion Explained  1.025698e-30 1.025698e-30 1.025698e-30 1.025698e-30
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
##                               ML10
## SS loadings           7.583908e-30
## Proportion Var        7.583908e-31
## Cumulative Var        7.393902e-01
## Proportion Explained  1.025698e-30
## Cumulative Proportion 1.000000e+00
# 再选3因子做因子分析
fa_result <- fa(predictors_scaled, nfactors = 3, rotate = "varimax", fm = "ml")

# 提取因子载荷矩阵,转为数据框
loadings_matrix <- as.data.frame(unclass(fa_result$loadings))
loadings_matrix$Variable <- rownames(loadings_matrix)

# 转换为长格式方便绘图
loadings_long <- melt(loadings_matrix, id.vars = "Variable", 
                      variable.name = "Factor", value.name = "Loading")

# 绘制因子载荷热力图,含载荷数值标签
ggplot(loadings_long, aes(x = Factor, y = Variable, fill = Loading)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Loading, 2)), size = 4) +
  scale_fill_gradient2(low = "#d7191c", high = "#1a9641", mid = "white",
                       midpoint = 0, limit = c(-1, 1), name = "载荷值") +
  theme_minimal(base_size = 14) +
  labs(title = "因子载荷热力图(含数值)", x = "因子", y = "变量") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 提取前两个因子的因子得分
fa_result <- fa(predictors_scaled, nfactors = 2, rotate = "varimax", fm = "ml")
factor_scores <- as.data.frame(fa_result$scores[, 1:2])
names(factor_scores) <- c("Factor1", "Factor2")

# 对因子得分进行K-means聚类(2类)
set.seed(123)
kmeans_result <- kmeans(factor_scores, centers = 2)

# 添加聚类标签和实际rainfall标签
factor_scores$cluster <- as.factor(kmeans_result$cluster)
factor_scores$rainfall <- as.factor(rainfall)

# 查看聚类结果与真实降雨的对应关系
table(聚类 = factor_scores$cluster, 实际下雨 = factor_scores$rainfall)
##     实际下雨
## 聚类    0    1
##    1  116 1395
##    2  424  255
# 输出聚类中心
kmeans_result$centers
##       Factor1    Factor2
## 1 -0.08083072  0.5515965
## 2  0.17987514 -1.2274850
factor_scores <- factor_scores %>%
  mutate(
    label = case_when(
      cluster == "1" & rainfall == "1" ~ "聚类1_下雨",
      cluster == "1" & rainfall == "0" ~ "聚类1_不下雨",
      cluster == "2" & rainfall == "1" ~ "聚类2_下雨",
      cluster == "2" & rainfall == "0" ~ "聚类2_不下雨"
    )
  )

label_colors <- c(
  "聚类1_下雨" = "#1f77b4",
  "聚类1_不下雨" = "#d62728",
  "聚类2_下雨" = "#ff7f0e",
  "聚类2_不下雨" = "purple"
)

ggplot(factor_scores, aes(x = Factor1, y = Factor2, color = label)) +
  geom_point(size = 2.5, alpha = 0.8) +
  scale_color_manual(values = label_colors) +
  labs(
    title = "因子空间中 K-means 聚类与实际rainfall状态对比",
    x = "Factor 1",
    y = "Factor 2",
    color = "聚类 + 实际"
  ) +
  theme_minimal()

4 模型评估

# 构建数据框(标准化预测变量 + 因变量)
logit_data <- as.data.frame(predictors_scaled)
logit_data$rainfall <- as.factor(rainfall)

# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(logit_data)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- logit_data[train_index, ]
test_data <- logit_data[-train_index, ]

# 训练逻辑回归模型
logit_model <- glm(rainfall ~ ., data = train_data, family = binomial)

# 训练集预测及分类
predicted_prob_train <- predict(logit_model, newdata = train_data, type = "response")
actual_train <- as.numeric(as.character(train_data$rainfall))
predicted_class_train <- ifelse(predicted_prob_train > 0.5, 1, 0)

# 测试集预测及分类
predicted_prob_test <- predict(logit_model, newdata = test_data, type = "response")
actual_test <- as.numeric(as.character(test_data$rainfall))
predicted_class_test <- ifelse(predicted_prob_test > 0.5, 1, 0)

# 训练集混淆矩阵和准确率
cat("\n=== 训练集性能 ===\n")
## 
## === 训练集性能 ===
confusion_matrix_train <- table(Predicted = predicted_class_train, Actual = actual_train)
print(confusion_matrix_train)
##          Actual
## Predicted    0    1
##         0  288   77
##         1  147 1240
accuracy_train <- mean(predicted_class_train == actual_train)
cat("训练集准确率:", round(accuracy_train, 4), "\n")
## 训练集准确率: 0.8721
# 测试集混淆矩阵和准确率
cat("\n=== 测试集性能 ===\n")
## 
## === 测试集性能 ===
confusion_matrix_test <- table(Predicted = predicted_class_test, Actual = actual_test)
print(confusion_matrix_test)
##          Actual
## Predicted   0   1
##         0  65  27
##         1  40 306
accuracy_test <- mean(predicted_class_test == actual_test)
cat("测试集准确率:", round(accuracy_test, 4), "\n")
## 测试集准确率: 0.847
# 计算并绘制训练集和测试集的ROC曲线及AUC
roc_train <- roc(response = actual_train, predictor = predicted_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_train <- auc(roc_train)

roc_test <- roc(response = actual_test, predictor = predicted_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_test <- auc(roc_test)

plot(roc_train, col = "blue", lwd = 2, main = "训练集与测试集 ROC 曲线")
plot(roc_test, col = "red", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
                                 paste("测试集 AUC =", round(auc_test, 4))),
       col = c("blue", "red"), lwd = 2, bty = "n")

# 提取前三个主成分,构建数据框并添加标签
pca_df <- as.data.frame(pca_variable$x[, 1:3])
colnames(pca_df) <- c("PC1", "PC2", "PC3")
pca_df$rainfall <- as.factor(rainfall)

# 划分训练集(80%)和测试集(20%)
set.seed(111)
n <- nrow(pca_df)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- pca_df[train_index, ]
test_data <- pca_df[-train_index, ]

# 基于前三主成分训练逻辑回归模型
logit_pca_model <- glm(rainfall ~ PC1 + PC2 + PC3, data = train_data, family = binomial)

# 训练集预测及分类
predicted_prob_train <- predict(logit_pca_model, newdata = train_data, type = "response")
predicted_class_train <- ifelse(predicted_prob_train > 0.5, 1, 0)
actual_train <- as.numeric(as.character(train_data$rainfall))

# 测试集预测及分类
predicted_prob_test <- predict(logit_pca_model, newdata = test_data, type = "response")
predicted_class_test <- ifelse(predicted_prob_test > 0.5, 1, 0)
actual_test <- as.numeric(as.character(test_data$rainfall))

# 计算并打印训练和测试集准确率
accuracy_train <- mean(predicted_class_train == actual_train)
cat("训练集准确率:", round(accuracy_train, 4), "\n")
## 训练集准确率: 0.8704
accuracy_test <- mean(predicted_class_test == actual_test)
cat("测试集准确率:", round(accuracy_test, 4), "\n")
## 测试集准确率: 0.8379
# 打印混淆矩阵
cat("训练集混淆矩阵:\n")
## 训练集混淆矩阵:
print(table(Predicted = predicted_class_train, Actual = actual_train))
##          Actual
## Predicted    0    1
##         0  280   80
##         1  147 1245
cat("测试集混淆矩阵:\n")
## 测试集混淆矩阵:
print(table(Predicted = predicted_class_test, Actual = actual_test))
##          Actual
## Predicted   0   1
##         0  76  34
##         1  37 291
# 计算训练和测试集ROC曲线及AUC
roc_train <- roc(response = actual_train, predictor = predicted_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_train <- auc(roc_train)
roc_test <- roc(response = actual_test, predictor = predicted_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_test <- auc(roc_test)

# 绘制ROC曲线
plot(roc_train, col = "darkgreen", lwd = 2, main = "PCA逻辑回归 ROC 曲线")
plot(roc_test, col = "orange", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
                                 paste("测试集 AUC =", round(auc_test, 4))),
       col = c("darkgreen", "orange"), lwd = 2, bty = "n")

set.seed(123)
n_repeats <- 30  # 重复实验次数
n <- nrow(pca_df)

# 初始化结果存储向量
train_accs <- numeric(n_repeats)
test_accs <- numeric(n_repeats)
train_aucs <- numeric(n_repeats)
test_aucs <- numeric(n_repeats)

for (i in 1:n_repeats) {
  # 随机划分训练集和测试集
  train_index <- sample(1:n, size = round(0.8 * n))
  train_data <- pca_df[train_index, ]
  test_data <- pca_df[-train_index, ]

  # 拟合逻辑回归模型
  model <- glm(rainfall ~ PC1 + PC2 + PC3, data = train_data, family = binomial)

  # 训练集预测及评估
  prob_train <- predict(model, newdata = train_data, type = "response")
  class_train <- ifelse(prob_train > 0.5, 1, 0)
  actual_train <- as.numeric(as.character(train_data$rainfall))
  train_accs[i] <- mean(class_train == actual_train)
  train_aucs[i] <- auc(roc(response = actual_train, predictor = prob_train))

  # 测试集预测及评估
  prob_test <- predict(model, newdata = test_data, type = "response")
  class_test <- ifelse(prob_test > 0.5, 1, 0)
  actual_test <- as.numeric(as.character(test_data$rainfall))
  test_accs[i] <- mean(class_test == actual_test)
  test_aucs[i] <- auc(roc(response = actual_test, predictor = prob_test))
}

# 输出训练集和测试集准确率及AUC的均值和标准差
cat("训练集准确率均值:", round(mean(train_accs), 4), " 标准差:", round(sd(train_accs), 4), "\n")
## 训练集准确率均值: 0.8652  标准差: 0.0042
cat("测试集准确率均值:", round(mean(test_accs), 4), " 标准差:", round(sd(test_accs), 4), "\n")
## 测试集准确率均值: 0.8625  标准差: 0.0155
cat("训练集 AUC 均值:", round(mean(train_aucs), 4), " 标准差:", round(sd(train_aucs), 4), "\n")
## 训练集 AUC 均值: 0.8934  标准差: 0.0044
cat("测试集 AUC 均值:", round(mean(test_aucs), 4), " 标准差:", round(sd(test_aucs), 4), "\n")
## 测试集 AUC 均值: 0.8867  标准差: 0.0184
set.seed(111) 

# 划分训练集(80%)和测试集(20%)
n <- nrow(predictors_scaled)
train_index <- sample(1:n, size = round(0.8 * n))
test_index <- setdiff(1:n, train_index)

# 提取训练集和测试集的自变量和因变量
train_predictors <- predictors_scaled[train_index, ]
test_predictors <- predictors_scaled[test_index, ]
train_rainfall <- rainfall[train_index]
test_rainfall <- rainfall[test_index]

# 在训练集上做因子分析,提取两个因子,旋转方式为varimax
fa_train <- fa(train_predictors, nfactors = 2, rotate = "varimax", fm = "ml")
train_scores <- as.data.frame(fa_train$scores[, 1:2])
names(train_scores) <- c("fa_Factor1", "fa_Factor2")
train_scores$rainfall <- as.factor(train_rainfall)

# 用训练集因子得分拟合逻辑回归模型
logit_fa_model <- glm(rainfall ~ fa_Factor1 + fa_Factor2, data = train_scores, family = binomial)

# 训练集预测和评估
prob_train <- predict(logit_fa_model, type = "response")
class_train <- ifelse(prob_train > 0.5, 1, 0)
actual_train <- as.numeric(as.character(train_rainfall))
train_accuracy <- mean(class_train == actual_train)
train_auc <- auc(roc(response = actual_train, predictor = prob_train))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# 用训练集载荷对测试集数据做因子得分
test_scores <- as.data.frame(scale(test_predictors) %*% fa_train$loadings[, 1:2])
names(test_scores) <- c("fa_Factor1", "fa_Factor2")
test_scores$rainfall <- as.factor(test_rainfall)

# 测试集预测和评估
prob_test <- predict(logit_fa_model, newdata = test_scores, type = "response")
class_test <- ifelse(prob_test > 0.5, 1, 0)
actual_test <- as.numeric(as.character(test_rainfall))
test_accuracy <- mean(class_test == actual_test)
test_auc <- auc(roc(response = actual_test, predictor = prob_test))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# 绘制训练集和测试集ROC曲线
roc_train <- roc(response = actual_train, predictor = prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_test <- roc(response = actual_test, predictor = prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_train, col = "darkgreen", lwd = 2, main = "因子分析逻辑回归:训练集与测试集 ROC 曲线")
lines(roc_test, col = "darkred", lwd = 2)
legend("bottomright",
       legend = c(paste("训练集 AUC =", round(train_auc, 4)),
                  paste("测试集 AUC =", round(test_auc, 4))),
       col = c("darkgreen", "darkred"), lwd = 2, bty = "n")

# 输出准确率和AUC
cat("训练集准确率:", round(train_accuracy, 4), "\n")
## 训练集准确率: 0.8733
cat("训练集 AUC:", round(train_auc, 4), "\n")
## 训练集 AUC: 0.8993
cat("测试集准确率:", round(test_accuracy, 4), "\n")
## 测试集准确率: 0.8333
cat("测试集 AUC:", round(test_auc, 4), "\n")
## 测试集 AUC: 0.8563

5 SVM

# 构建数据框(标准化变量 + 标签)
logit_data <- as.data.frame(predictors_scaled)
logit_data$rainfall <- as.factor(rainfall)

# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(logit_data)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- logit_data[train_index, ]
test_data <- logit_data[-train_index, ]

# 训练逻辑回归模型
logit_model <- glm(rainfall ~ ., data = train_data, family = binomial)

# 训练集预测及评估
cat("\n=== 训练集性能 ===\n")
## 
## === 训练集性能 ===
predicted_prob_train <- predict(logit_model, newdata = train_data, type = "response")
actual_train <- as.numeric(as.character(train_data$rainfall))
predicted_class_train <- ifelse(predicted_prob_train > 0.5, 1, 0)

confusion_train <- confusionMatrix(as.factor(predicted_class_train), as.factor(actual_train), positive = "1")
print(confusion_train$table)
##           Reference
## Prediction    0    1
##          0  288   77
##          1  147 1240
cat("训练集准确率:", round(confusion_train$overall["Accuracy"], 4), "\n")
## 训练集准确率: 0.8721
cat("训练集 AUC:", round(auc(roc(response = actual_train, predictor = predicted_prob_train)), 4), "\n")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 训练集 AUC: 0.9016
cat("训练集分类指标(精确率/召回率/F1):\n")
## 训练集分类指标(精确率/召回率/F1):
print(confusion_train$byClass[c("Precision", "Recall", "F1")])
## Precision    Recall        F1 
## 0.8940159 0.9415338 0.9171598
# 绘制训练集ROC曲线
roc_train <- roc(response = actual_train, predictor = predicted_prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_train, col = "blue", lwd = 2, main = "训练集与测试集 ROC 曲线")

# 测试集预测及评估
cat("\n=== 测试集性能 ===\n")
## 
## === 测试集性能 ===
predicted_prob_test <- predict(logit_model, newdata = test_data, type = "response")
actual_test <- as.numeric(as.character(test_data$rainfall))
predicted_class_test <- ifelse(predicted_prob_test > 0.5, 1, 0)

confusion_test <- confusionMatrix(as.factor(predicted_class_test), as.factor(actual_test), positive = "1")
print(confusion_test$table)
##           Reference
## Prediction   0   1
##          0  65  27
##          1  40 306
cat("测试集准确率:", round(confusion_test$overall["Accuracy"], 4), "\n")
## 测试集准确率: 0.847
cat("测试集 AUC:", round(auc(roc(response = actual_test, predictor = predicted_prob_test)), 4), "\n")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 测试集 AUC: 0.8782
cat("测试集分类指标(精确率/召回率/F1):\n")
## 测试集分类指标(精确率/召回率/F1):
print(confusion_test$byClass[c("Precision", "Recall", "F1")])
## Precision    Recall        F1 
## 0.8843931 0.9189189 0.9013255
# 绘制测试集ROC曲线(叠加在训练集曲线上)
roc_test <- roc(response = actual_test, predictor = predicted_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_test, col = "red", lwd = 2, add = TRUE)

# 添加图例
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc(roc_train), 4)),
                                 paste("测试集 AUC =", round(auc(roc_test), 4))),
       col = c("blue", "red"), lwd = 2, bty = "n")

# 准备数据框(前三个主成分 + 标签)
pca_df <- as.data.frame(pca_variable$x[, 1:3])
colnames(pca_df) <- c("PC1", "PC2", "PC3")
pca_df$rainfall <- as.factor(rainfall)

# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(pca_df)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- pca_df[train_index, ]
test_data <- pca_df[-train_index, ]

# 训练基于RBF核的SVM模型(支持概率输出)
svm_model <- svm(rainfall ~ ., data = train_data, kernel = "radial", probability = TRUE)

cat("\n=== 训练集性能 ===\n")
## 
## === 训练集性能 ===
predicted_train <- predict(svm_model, newdata = train_data, probability = TRUE)
prob_train <- attr(predicted_train, "probabilities")[,2]  # 正类概率
actual_train <- train_data$rainfall

confusion_train <- confusionMatrix(predicted_train, actual_train, positive = "1")
print(confusion_train$table)
##           Reference
## Prediction    0    1
##          0  273   60
##          1  162 1257
cat("训练集准确率:", round(confusion_train$overall["Accuracy"], 4), "\n")
## 训练集准确率: 0.8733
cat("训练集分类指标(精确率/召回率/F1):\n")
## 训练集分类指标(精确率/召回率/F1):
print(confusion_train$byClass[c("Precision", "Recall", "F1")])
## Precision    Recall        F1 
## 0.8858351 0.9544419 0.9188596
roc_train <- roc(response = as.numeric(as.character(actual_train)), predictor = prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_train <- auc(roc_train)
cat("训练集 AUC:", round(auc_train, 4), "\n")
## 训练集 AUC: 0.8535
plot(roc_train, col = "blue", lwd = 2, main = "基于 PCA 的 SVM:训练集与测试集 ROC 曲线")

cat("\n=== 测试集性能 ===\n")
## 
## === 测试集性能 ===
predicted_test <- predict(svm_model, newdata = test_data, probability = TRUE)
prob_test <- attr(predicted_test, "probabilities")[,2]
actual_test <- test_data$rainfall

confusion_test <- confusionMatrix(predicted_test, actual_test, positive = "1")
print(confusion_test$table)
##           Reference
## Prediction   0   1
##          0  59  24
##          1  46 309
cat("测试集准确率:", round(confusion_test$overall["Accuracy"], 4), "\n")
## 测试集准确率: 0.8402
cat("测试集分类指标(精确率/召回率/F1):\n")
## 测试集分类指标(精确率/召回率/F1):
print(confusion_test$byClass[c("Precision", "Recall", "F1")])
## Precision    Recall        F1 
## 0.8704225 0.9279279 0.8982558
roc_test <- roc(response = as.numeric(as.character(actual_test)), predictor = prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_test <- auc(roc_test)
cat("测试集 AUC:", round(auc_test, 4), "\n")
## 测试集 AUC: 0.824
plot(roc_test, col = "red", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
                                 paste("测试集 AUC =", round(auc_test, 4))),
       col = c("blue", "red"), lwd = 2, bty = "n")

# 构建数据集(取前两个因子得分 + 标签)
factor_scores <- as.data.frame(fa_result$scores[, 1:2])
colnames(factor_scores) <- c("Factor1", "Factor2")
factor_scores$rainfall <- as.factor(rainfall)

# 划分训练集(80%)和测试集(20%)
set.seed(1234567)
n <- nrow(factor_scores)
train_index <- sample(1:n, size = round(0.8 * n))
train_data <- factor_scores[train_index, ]
test_data <- factor_scores[-train_index, ]

# 训练基于径向核的SVM模型(支持概率预测)
svm_model <- svm(rainfall ~ ., data = train_data, kernel = "radial", probability = TRUE)

cat("\n=== 训练集性能 ===\n")
## 
## === 训练集性能 ===
predicted_train <- predict(svm_model, newdata = train_data, probability = TRUE)
prob_train <- attr(predicted_train, "probabilities")[,2]
actual_train <- train_data$rainfall

confusion_train <- confusionMatrix(predicted_train, actual_train, positive = "1")
print(confusion_train$table)
##           Reference
## Prediction    0    1
##          0  275   61
##          1  160 1256
cat("训练集准确率:", round(confusion_train$overall["Accuracy"], 4), "\n")
## 训练集准确率: 0.8739
cat("训练集指标(精确率/召回率/F1):\n")
## 训练集指标(精确率/召回率/F1):
print(confusion_train$byClass[c("Precision", "Recall", "F1")])
## Precision    Recall        F1 
## 0.8870056 0.9536826 0.9191365
roc_train <- roc(response = as.numeric(as.character(actual_train)), predictor = prob_train)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_train <- auc(roc_train)
cat("训练集 AUC:", round(auc_train, 4), "\n")
## 训练集 AUC: 0.8403
plot(roc_train, col = "blue", lwd = 2, main = "基于因子得分的 SVM:训练集与测试集 ROC 曲线")

cat("\n=== 测试集性能 ===\n")
## 
## === 测试集性能 ===
predicted_test <- predict(svm_model, newdata = test_data, probability = TRUE)
prob_test <- attr(predicted_test, "probabilities")[,2]
actual_test <- test_data$rainfall

confusion_test <- confusionMatrix(predicted_test, actual_test, positive = "1")
print(confusion_test$table)
##           Reference
## Prediction   0   1
##          0  58  26
##          1  47 307
cat("测试集准确率:", round(confusion_test$overall["Accuracy"], 4), "\n")
## 测试集准确率: 0.8333
cat("测试集指标(精确率/召回率/F1):\n")
## 测试集指标(精确率/召回率/F1):
print(confusion_test$byClass[c("Precision", "Recall", "F1")])
## Precision    Recall        F1 
## 0.8672316 0.9219219 0.8937409
roc_test <- roc(response = as.numeric(as.character(actual_test)), predictor = prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
auc_test <- auc(roc_test)
cat("测试集 AUC:", round(auc_test, 4), "\n")
## 测试集 AUC: 0.8332
plot(roc_test, col = "red", lwd = 2, add = TRUE)
legend("bottomright", legend = c(paste("训练集 AUC =", round(auc_train, 4)),
                                 paste("测试集 AUC =", round(auc_test, 4))),
       col = c("blue", "red"), lwd = 2, bty = "n")